# load packages
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(zoo)
library(xts)
library(tseries)
library(forecast)
library(TSA)
library(reshape2)
library(naniar)
library(lubridate)
library(Metrics)
})
# import data
raw.df <- read.csv("city_temperature.csv")
# remove observations with year = 200, 201 or 2020 (only partial data), Day = 0
wip.df <- raw.df %>%
filter(!Year %in% (200:201)) %>%
filter(Day != 0)
# date data type
wip.df <- wip.df %>%
mutate(Date = as.Date(with(wip.df,paste(Year,Month,Day,sep="-")),
"%Y-%m-%d")) %>%
dplyr::select(-c("Month","Day","Year"))
# number of days between the start and end date (inclusive)
n_days <- max(wip.df$Date) - min(wip.df$Date) + 1
Subset US cities
# list of US cities of interest
CitiesOfInterest <- c("Chicago",
"Honolulu",
"San Diego",
"Kansas City",
"Las Vegas",
"Los Angeles",
"Miami Beach",
"Minneapolis St. Paul",
"New York City",
"Phoenix",
"Raleigh Durham",
"San Antonio")
# subset US cities of interest
df.usa <- wip.df %>% dplyr::filter((Country=="US")
& (City %in% CitiesOfInterest)
& (State!="Additional Territories")) %>%
dplyr::select(-c("Region", "Country", "State")) # drop redundant columns
# reshape data set, one column per city
df_melt <- melt(df.usa, id.vars = c("Date","City"))
df.usa2 <- dcast(df_melt, Date ~ City)
# update values where `AvgTemperature` = -99 by forward filling
df.usa2 <- df.usa2 %>%
naniar::replace_with_na_all(condition = ~.x == -99) %>%
na.locf(fromLast = FALSE)
Create Test and Train Splits
# select Chicago
df <- df.usa2 %>% dplyr::select(c("Date","Chicago")) %>% rename(AvgTemp = "Chicago")
# ts of daily avg temp
ts.d <- df %>% dplyr::select(-c("Date")) %>% ts(start=c(1995,1), frequency=365)
# split test and train
train.d <- window(ts.d, end=c(2019,365)) ; autoplot(train.d)
test.d <- window(ts.d, start=c(2020,1)) ; autoplot(test.d)
# check length
length(ts.d) == n_days
## [1] TRUE
length(ts.d) == length(train.d) + length(test.d)
## [1] TRUE
# weekly avg temp
ts.w <- df %>%
# group by week, use week start date
group_by(WeekStart = lubridate::floor_date(df$Date,unit="week")) %>%
# calculate avg
summarize(AvgTemp = mean(AvgTemp) %>% round(2)) %>%
# drop "WeekStart" date before converting to ts
dplyr::select(-c("WeekStart")) %>%
# convert to time series object
ts(start=c(1995,1), frequency=52)
# split test and train
train.w <- window(ts.w, start=c(1995,1), end=c(2019,52)) ; autoplot(train.w)
test.w <- window(ts.w, start=c(2020,1)) ; autoplot(test.w)
# check length
length(ts.w) == length(train.w) + length(test.w)
## [1] TRUE
# monthly avg temp
ts.m <- df %>%
# group by YearMonth
group_by(YearMonth = as.yearmon(Date)) %>%
# calculate avg
summarize(AvgTemp = mean(AvgTemp) %>% round(2)) %>%
# drop "YearMonth" date before converting to ts
dplyr::select(-c("YearMonth")) %>%
# convert to time series object
ts(start=c(1995,1), frequency=12)
# split test and train
train.m <- window(ts.m, end=c(2019,12)) ; autoplot(train.m)
test.m <- window(ts.m, start=c(2020,1)) ; autoplot(test.m)
# check length
length(ts.m) == length(train.m) + length(test.m)
## [1] TRUE
#STL Decomposition
STL_D = stl(train.d[,1],s.window = "periodic")
plot(STL_D, main="Daily Decomposition")
STL_W = stl(train.w[,1],s.window = "periodic")
plot(STL_W, main="Weekly Decomposition")
STL_M = stl(train.m[,1],s.window = "periodic")
plot(STL_M, main="Monthly Decomposition")
#Set Frequency for TS for STL
ts.sd.w = ts(ts.w[1:1324,1],frequency = 52)
# split test and train
train.w.sd = ts.sd.w[1:1300]
tstrain.sd.w = ts(train.w.sd[1:1300],frequency = 52)
test.w.sd = ts.sd.w[1301:1324]
tstest.sd.w = ts(test.w.sd[1:24],frequency = 52)
# check length
length(ts.sd.w) == length(train.w.sd) + length(test.w.sd)
## [1] TRUE
MODELING
#STL on dataset
STL_SD_W = stl(ts.sd.w,s.window="periodic")
fcst_sd_w = forecast(STL_SD_W,h=length(test.w.sd))
plot(fcst_sd_w)
checkresiduals(fcst_sd_w)
## Warning in checkresiduals(fcst_sd_w): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 233.52, df = 102, p-value = 2.577e-12
##
## Model df: 2. Total lags used: 104
Box.test(fcst_sd_w,type="Ljung-Box")
##
## Box-Ljung test
##
## data: fcst_sd_w
## X-squared = 10.435, df = 1, p-value = 0.001237
ts.plot(residuals(fcst_sd_w),ylab="Residuals",xlab="",main="Residuals from STL Model")
acf(residuals(fcst_sd_w),main="ACF of STL Residuals")
#STL on train dataset
STL_SD_W_train = stl(train.w[,1],s.window = "periodic")
plot(STL_SD_W_train, main="STL Train Decomposition")
fcst_sd_w_train = forecast(STL_SD_W_train,h=length(test.w))
plot(fcst_sd_w_train,main="Forecasts from STL and ARIMA")
#Check Accuracy
acc_train = forecast::accuracy(fcst_sd_w_train,test.w)
acc_train
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01646531 6.013044 4.588315 -5.09680 14.93285 0.6917317
## Test set -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
## ACF1 Theil's U
## Training set 0.1392196 NA
## Test set 0.5301805 1.183986
#SMAPE
smape_train = smape(train.w,fcst_sd_w_train$residuals)
smape_test = smape(test.w,fcst_sd_w_train$mean)
#Residuals
checkresiduals(fcst_sd_w_train)
## Warning in checkresiduals(fcst_sd_w_train): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 236.6, df = 102, p-value = 1.054e-12
##
## Model df: 2. Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train,type="Ljung-Box")
##
## Box-Ljung test
##
## data: fcst_sd_w_train
## X-squared = 10.49, df = 1, p-value = 0.0012
ts.plot(residuals(fcst_sd_w_train),ylab="Residuals",xlab="",main="Residuals from STL Train Model")
acf(residuals(fcst_sd_w_train),main="ACF of STL Train Residuals")
#STL on train dataset V1
STL_SD_W_train_v1 = stl(train.w[,1],s.window = 10,t.window=13)
plot(STL_SD_W_train_v1, main="STL Train Decomposition")
fcst_sd_w_train_v1 = forecast(STL_SD_W_train_v1,h=length(test.w))
plot(fcst_sd_w_train_v1,main="Forecasts from STL and ARIMA")
#Check Accuracy
acc_train_v1 = forecast::accuracy(fcst_sd_w_train_v1,test.w)
acc_train
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01646531 6.013044 4.588315 -5.09680 14.93285 0.6917317
## Test set -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
## ACF1 Theil's U
## Training set 0.1392196 NA
## Test set 0.5301805 1.183986
#SMAPE
smape_train_v1 = smape(train.w,fcst_sd_w_train_v1$residuals)
smape_test_v1 = smape(test.w,fcst_sd_w_train_v1$mean)
#Residuals
checkresiduals(fcst_sd_w_train_v1)
## Warning in checkresiduals(fcst_sd_w_train_v1): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 388.83, df = 102, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train_v1,type="Ljung-Box")
##
## Box-Ljung test
##
## data: fcst_sd_w_train_v1
## X-squared = 9.893, df = 1, p-value = 0.001659
ts.plot(residuals(fcst_sd_w_train_v1),ylab="Residuals",xlab="",main="Residuals from STL Train Model")
acf(residuals(fcst_sd_w_train_v1),main="ACF of STL Train Residuals")
#MSTL on train dataset
STL_SD_W_train_v2 = mstl(train.w[,1],s.window = 13)
plot(STL_SD_W_train_v2, main="STL Train Decomposition")
fcst_sd_w_train_v2 = forecast(STL_SD_W_train_v2,h=length(test.w))
plot(fcst_sd_w_train_v2,main="Forecasts from STL and ARIMA")
#Check Accuracy
acc_train_v2= forecast::accuracy(fcst_sd_w_train_v2,test.w)
acc_train
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01646531 6.013044 4.588315 -5.09680 14.93285 0.6917317
## Test set -3.99185563 8.902500 7.484941 -10.17876 19.56353 1.1284254
## ACF1 Theil's U
## Training set 0.1392196 NA
## Test set 0.5301805 1.183986
#SMAPE
smape_train_v2 = smape(train.w,fcst_sd_w_train_v2$residuals)
smape_test_v2 = smape(test.w,fcst_sd_w_train_v2$mean)
#Residuals
checkresiduals(fcst_sd_w_train_v2)
## Warning in checkresiduals(fcst_sd_w_train_v2): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 346.19, df = 102, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 104
#Ljung Box Test
Box.test(fcst_sd_w_train_v2,type="Ljung-Box")
##
## Box-Ljung test
##
## data: fcst_sd_w_train_v2
## X-squared = 9.221, df = 1, p-value = 0.002393
ts.plot(residuals(fcst_sd_w_train_v2),ylab="Residuals",xlab="",main="Residuals from STL Train Model")
acf(residuals(fcst_sd_w_train_v2),main="ACF of STL Train Residuals")